Set-up packages // libraries and prepare for data import:

## Uncomment install.packages and run outside of notebook environment: 
## install.packages(c("psych" ,"xtable", "tidyverse", "jsonlite", "likert", "ggplot2", "ploty", "mosaic", "modelr", "broom"))
library(psych)
library(likert)
Loading required package: ggplot2

Attaching package: ‘ggplot2’

The following objects are masked from ‘package:psych’:

    %+%, alpha

Loading required package: xtable
library(jsonlite)
library(ggplot2)
library(plotly)

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout
library(modelr)

Attaching package: ‘modelr’

The following object is masked from ‘package:psych’:

    heights
library(broom)

Attaching package: ‘broom’

The following object is masked from ‘package:modelr’:

    bootstrap
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())

Set-up directories, import and clean data:

rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R") 
getwd()
[1] "/Users/allieblaising/Desktop/bang/R"
dataPath = "../.data"
## Define function to extract survey results: 
extractSurvey = function(frame,survey) {
  rounds = seq(1,length(frame$results.format[[1]]))
  roundResponses = lapply(rounds, function(round) {
    getCol = paste("results.",survey,".",round, sep="")
    surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
    newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
    surveyFrame = frame[,surveyCols]
    if (is.null(newCols)) {return("No newCols")}
    names(surveyFrame) = newCols
    surveyFrame$id = frame$id
    surveyFrame$round = round
    surveyFrame$batch = frame$batch
    surveyFrame$rooms = frame$rooms
    surveyFrame$manipulation = frame$results.manipulationCheck
    surveyFrame$blacklist = frame$results.blacklistCheck
    return(surveyFrame)
  })
  return(Reduce(rbind, roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below): 
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) { 
  if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
    batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
    return(any(batchData$batchComplete == TRUE))
  } 
  return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
  userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector=TRUE)
  return(flatten(userFile, recursive = TRUE))
})
## Retroactively find rooms from chat data: 
overlappingFiles = Reduce(function(x,y) merge(x, y, all=TRUE), userFiles)
roundsWithRooms = apply(overlappingFiles,1,function(x) {
  roomsForIndividual = lapply(seq(1,3),function(y) {
    x$room = x$rooms[y]
    x$round = y
    return(x)
  })
  return(Reduce(rbind, roomsForIndividual))
})
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ tibble  1.4.2     ✔ purrr   0.2.5
✔ tidyr   0.8.1     ✔ dplyr   0.7.6
✔ readr   1.1.1     ✔ stringr 1.3.1
✔ tibble  1.4.2     ✔ forcats 0.3.0
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ ggplot2::%+%()     masks psych::%+%()
✖ ggplot2::alpha()   masks psych::alpha()
✖ broom::bootstrap() masks modelr::bootstrap()
✖ dplyr::filter()    masks plotly::filter(), stats::filter()
✖ purrr::flatten()   masks jsonlite::flatten()
✖ dplyr::lag()       masks stats::lag()
✖ dplyr::recode()    masks likert::recode()
library(mosaic)
Loading required package: lattice
Loading required package: ggformula

New to ggformula?  Try the tutorials: 
    learnr::run_tutorial("introduction", package = "ggformula")
    learnr::run_tutorial("refining", package = "ggformula")

Attaching package: ‘ggformula’

The following object is masked from ‘package:modelr’:

    na.warn

Loading required package: mosaicData
Loading required package: Matrix

Attaching package: ‘Matrix’

The following object is masked from ‘package:tidyr’:

    expand


The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.

Attaching package: ‘mosaic’

The following object is masked from ‘package:Matrix’:

    mean

The following objects are masked from ‘package:dplyr’:

    count, do, tally

The following object is masked from ‘package:purrr’:

    cross

The following object is masked from ‘package:modelr’:

    resample

The following object is masked from ‘package:plotly’:

    do

The following object is masked from ‘package:ggplot2’:

    stat

The following objects are masked from ‘package:psych’:

    logit, read.file, rescale

The following objects are masked from ‘package:stats’:

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test, quantile, sd, t.test, var

The following objects are masked from ‘package:base’:

    max, mean, min, prod, range, sample, sum
## To filter the right batch numbers, add the first batch number for our runs (i.e. batch >= "first batch #") 
## overlappingFiles <- overlappingFiles %>% filter(batch=="1536132233074")

More cleaning before visualizations:

## Apply extract survey function to extract the right columns and rows for viability survey: 
# overlappingFiles <- overlappingFiles %>% filter(batch>="1536276904547") 
# qFifteen <- as.data.frame(extractSurvey(overlappingFiles, 'qFifteenCheck'))
# qSixteen <- as.data.frame(extractSurvey(overlappingFiles, 'qSixteenCheck'))
# viabilitySurvey <- as.data.frame(extractSurvey(overlappingFiles, 'viabilityCheck'))  
# qFifteen <- qFifteen %>% select(id, results.qFifteenCheck)
# qSixteen <- qSixteen %>% select(id, results.qSixteenCheck) 
# qFifteen$id <-  unlist(qFifteen$id)
# qSixteen$id <-  unlist(qSixteen$id)
# postSurveyQs <- cbind(qFifteen, qSixteen) 
# frame <- cbind(viabilitySurvey, postSurveyQs)  
# frame <- frame[, !duplicated(colnames(frame))]
frame <- extractSurvey(overlappingFiles, 'viabilityCheck')
## Reduce to vertically combine rows in roundwithRooms list:  
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe, use manipulation check b/c required for complete observation: 
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge: 
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe: 
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds: 
data2 = data2 %>% select(id, batch, room, bonus, name, friends, 
                         friends_history, results.condition, results.format,
                  results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (**this should be simplified**)
data2$batch <- unlist(data2$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id) 
data$batch <- unlist(data$batch)
## Before merge, data and data2 should have the same # of observations
## Merge columns by id, round and batch #s: 
data <- left_join(data, data2, by=NULL)
Joining, by = c("id", "round", "batch")
## Subset only observations with batch #s in complete batches 
allConditions <- data[data$batch %in% completeBatches, ]

Conditionally assign conditions based on treatment and results column:

## Messy, but robust? Verify, verify, verify people:   
data <- data %>% mutate(
  condition = case_when(
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='baseline' & results.format=="1:3" & round==1 ~ "A" ,
    results.condition=='baseline' & results.format=="1:3" & round==2 ~ "B" ,
    results.condition=='baseline' & results.format=="1:3" & round==3 ~ "C" 
  )) 

Set-up for factors for viability questions:

data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off): 
data <- na.omit(data)
## Factor for visualizations: 
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree") 
clean <- data %>% 
  mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels))) 
## Create a new dataframe that converts factors to numeric for statistical analyses:
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
  stats$sum[i] <- sum(stats[i,1:14])                          
} 
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping: 
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
  group_by(room, batch, round, condition, results.condition, results.format) %>%  ## to-do: add group ID in storage db. 
  summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n, groupID=runif(1,0,2)) %>% 
## Filter out all teams with n=1 (i.e. a person in a one person team) 
  filter(n==2)
## Individual proportion: 
individualProportion <- stats %>% group_by(round, batch, room) %>% 
  mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>% 
  filter(n>1)
## Table showing how many teams we've run in each condition combination:  
print(table(groupedProportion$condition, groupedProportion$results.condition)) 
    
     baseline control treatment
  A        26      10        25
  Ap        0      10        25
  B        20      14        22
  C        29       0         0

Probability of fracture across teams and condition combinations:

  left_join(baselineFracture1, baselineFracture2, by=NULL, drop=TRUE)
Joining, by = c("room", "batch", "round", "condition", "results.condition", "results.format", "n", "mean", "median", "prop", "groupID", "fracture")

Section #1: Does fracture have continuity with prior measures?

1A: Report the % of correct guesses on the manipulation check:

## Treatment manipulation: 
treatmentManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="treatment") %>% 
  summarise(n=n()) 
## If user manipulation answers == manipulation answer key then add 1 
treatmentManipulation$correctAnswers <- sum(ifelse(treatmentManipulation$manipulationAnswers==treatmentManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for treatment: 
treatmentManipulation <- treatmentManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(treatmentManipulation))))
## Control manipulation: 
controlManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="control") %>% 
  summarise(n=n()) 
## If user manipulation answers == manipulation answer key then add 1 
controlManipulation$correctAnswers <- sum(ifelse(controlManipulation$manipulationAnswers==controlManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for control: 
controlManipulation <- controlManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(controlManipulation))))
## Proportion test comparing treatment manipulation percent correct with chance: 

1B: Proportion test comparing treatment manipulation percent correct with chance:

## ⅓ ~ 33% = chance, ~43 = treatment (FILL-IN EACH TIME YOU RUN WITH NEW DATA!)
propManipulation <- prop.test(x = c(33, 43), n = c(100, 100))
print(propManipulation)

    2-sample test for equality of proportions with continuity correction

data:  c(33, 43) out of c(100, 100)
X-squared = 1.719, df = 1, p-value = 0.1898
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.24382407  0.04382407
sample estimates:
prop 1 prop 2 
  0.33   0.43 

1C: Logistic regression predicting binary fracture outcome from viability scales:

## Split data into 60% training and 40% testing data sets to test how well the model performs 
set.seed(123)
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture) 
sample <- sample(c(TRUE, FALSE), nrow(groupedProportionFracture), replace = T, prob = c(0.6,0.4))
train <- groupedProportionFracture[sample, ]
test <- groupedProportionFracture[!sample, ]
## Simple logistic regression: we will fit a logistic regression model in order to predict 
## the probability of fracture based on a team's average viability sum: 
model1 <- glm(fracture ~ mean, family = "binomial", data = train)
summary(model1)

Call:
glm(formula = fracture ~ mean, family = "binomial", data = train)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.94811  -0.46471   0.04798   0.38542   2.43161  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) 13.79411    2.58357   5.339 9.34e-08 ***
mean        -0.26503    0.04908  -5.400 6.68e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 152.165  on 109  degrees of freedom
Residual deviance:  71.765  on 108  degrees of freedom
AIC: 75.765

Number of Fisher Scoring iterations: 6
## To assess the linear regression deviance, look at deviance in summary output, if 
## deviance == sum of sqaures in linear regression, null deviance == difference between 
## a model with only the intercept ("no mean predictors") and the a saturated model 
## a model with a theoretically perfect fit. Model deviance (residual deviance) should be lower 
## small values == better fit. 
tidy(model1)
## Coefficient estimtes from log regression characterize relationship between the predictor and 
## response variable on a log-odds scale, so binary increase from no fracture - fracture can be interpreted as associated with a decrease in mean viability sum. 
## More coefficient output: measure the confidence intervals and accuracy of the coefficent: 
confint(model1)
Waiting for profiling to be done...
                 2.5 %     97.5 %
(Intercept)  9.3617399 19.6211689
mean        -0.3757527 -0.1807974
## Making predictions: 
## What is the probability of fracture given the following team mean viability scores: for example purposes: 40 and 65: 
predict(model1, data.frame(mean = c(50, 65)), type = "response")
         1          2 
0.63238720 0.03127928 
## From the output, we can see that the probability of fracture decreases by ~40% when mean viability sum increases from 40 to 65. 
## Model evaluation & diagnostics: 
## How well does the model fit the data? And how accurate are the predictions on an out-of-sample data set?
## Residul assessment: 
model1_data <- augment(model1) %>% 
  mutate(index = 1:n())
ggplot(model1_data, aes(index, .std.resid, color = mean)) + 
  geom_point(alpha = .5) +
  geom_ref_line(h = 3)

## Validation of predicted values: 
## How well does the model perform when predicting the target variable on out-of-sample observations? 
test.predicted.m1 <- predict(model1, newdata = test, type = "response")
## Classification performance for each model on the test data. Output gives us a list of true / false positives: 
list(
  model1 = table(test$mean, test.predicted.m1 > 0.5) %>% prop.table() %>% round(3)) 
$model1
      
       FALSE  TRUE
  32   0.000 0.014
  32.5 0.000 0.014
  36   0.000 0.028
  36.5 0.000 0.014
  37   0.000 0.014
  41.5 0.000 0.028
  44   0.000 0.014
  45   0.000 0.014
  46.5 0.000 0.028
  47   0.000 0.014
  48.5 0.000 0.042
  49   0.000 0.028
  49.5 0.000 0.014
  50   0.000 0.028
  50.5 0.000 0.014
  52   0.000 0.028
  52.5 0.014 0.000
  53   0.028 0.000
  54   0.028 0.000
  54.5 0.014 0.000
  55.5 0.028 0.000
  56   0.099 0.000
  56.5 0.014 0.000
  57   0.028 0.000
  58   0.028 0.000
  58.5 0.042 0.000
  59   0.014 0.000
  59.5 0.014 0.000
  60   0.014 0.000
  60.5 0.014 0.000
  61   0.014 0.000
  61.5 0.028 0.000
  62   0.028 0.000
  62.5 0.014 0.000
  63   0.056 0.000
  64   0.014 0.000
  65   0.014 0.000
  66   0.014 0.000
  66.5 0.014 0.000
  69   0.014 0.000
  70   0.070 0.000
table(test$mean, test.predicted.m1 > 0.5)
      
       FALSE TRUE
  32       0    1
  32.5     0    1
  36       0    2
  36.5     0    1
  37       0    1
  41.5     0    2
  44       0    1
  45       0    1
  46.5     0    2
  47       0    1
  48.5     0    3
  49       0    2
  49.5     0    1
  50       0    2
  50.5     0    1
  52       0    2
  52.5     1    0
  53       2    0
  54       2    0
  54.5     1    0
  55.5     2    0
  56       7    0
  56.5     1    0
  57       2    0
  58       2    0
  58.5     3    0
  59       1    0
  59.5     1    0
  60       1    0
  60.5     1    0
  61       1    0
  61.5     2    0
  62       2    0
  62.5     1    0
  63       4    0
  64       1    0
  65       1    0
  66       1    0
  66.5     1    0
  69       1    0
  70       5    0

1D: Graph of fracture/no fracture vs. mean/stdev viability scale

## Overall: 
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean)) 
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 across all conditions (n=2)", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") 

## By condition + format: 
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean)) 
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 (n=2)", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition) 

ggplot(data=groupedProportionFracture, aes(fracture, mean)) + 
   geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums across all conditions (n=2)") +  facet_grid(condition ~ results.condition) 

ggplot(data=groupedProportionFracture, aes(fracture, mean)) + 
   geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums (n=2)") +  facet_grid(condition ~ results.condition) 

Section #2: How often does fracture occur?

2A: P(binary fracture) histogram of fracture proportion (by team)

## First make sure you've made ABC condition
ggplot(data=groupedProportion, aes(groupedProportion$prop)) + 
  geom_histogram(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Frequency of team fracture proportions by condition and pattern sequence", 
                                  x="team fracture proportion", y="Count") + facet_grid(results.condition ~ .) 

library(MASS)     

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select

The following object is masked from ‘package:plotly’:

    select
## Test if whether fracture smoking habit is independent of condition at .05 significance level: 
chi = table(groupedProportionFracture$fracture, groupedProportionFracture$results.condition)  
chi
   
    baseline control treatment
  0       41      18        38
  1       34      16        34
print(chisq.test(chi)) 

    Pearson's Chi-squared test

data:  chi
X-squared = 0.059809, df = 2, p-value = 0.9705

2B: What’s the overall % of fracturing the second time? (by condition)

## Filtering second time only: 
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture)
## TO-DO: review below code, worried it's incorrect: 
groupedProportionFractureSecond <- groupedProportionFracture %>% group_by(results.condition) %>% 
      filter(condition=="Ap") %>% 
      mutate(overallPercent = sum(as.numeric(fracture))) %>% 
      summarise(n=n(), overallPercent=unique(overallPercent)/n) 
print(groupedProportionFractureSecond)
## For baseline theoretical distribution: (to-do: look over this, worried also that it's wrong)
groupedProportionFractureSecondBase <- groupedProportionFractureBaseline %>% 
      filter(round=="3") %>% 
      summarise(n=n(), overallPercent=sum(as.numeric(fracture))/n) 
print(mean(groupedProportionFractureSecondBase$overallPercent))
[1] 0.4930556
## Set-up separate groups for proportion tests to answer: does unmasked fracture more/less than masked 
## and more/less than new pairs? 
## Control: 
## Proportion tests for each (TO-DO, figure out argument stuff: this doesn't seem like the right test to run for the question we're asking?) also, fill-this in every time you get new data: 
## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. treatment + baseline) 
treatmentvsBaselineChange <- c(48, 49)
propFractureChangeTreatmentvsBase <- prop.test(x = c(treatmentvsBaselineChange), n = c(100, 100))
print(propFractureChangeTreatmentvsBase) 

    2-sample test for equality of proportions with continuity correction

data:  c(treatmentvsBaselineChange) out of c(100, 100)
X-squared = 0, df = 1, p-value = 1
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.1585211  0.1385211
sample estimates:
prop 1 prop 2 
  0.48   0.49 
## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. control + baseline) 
controlvsBaselineChange <- c(40, 49)
propFractureChangeControlvsBase <- prop.test(x = c(controlvsBaselineChange), n = c(100, 100))
print(propFractureChangeControlvsBase)

    2-sample test for equality of proportions with continuity correction

data:  c(controlvsBaselineChange) out of c(100, 100)
X-squared = 1.2957, df = 1, p-value = 0.255
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.23718348  0.05718348
sample estimates:
prop 1 prop 2 
  0.40   0.49 

Section 3: How consistent is fracture?

3A: calculating the conditional probabilities of fracture for each condition:

controlFracture12 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured only 1
controlFractureOnly1 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture) 
# of groups who fractured only 2
controlFractureOnly2 <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured never        
controlFractureNever <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture)
conditionalProbControl <- matrix(c(controlFracture12, controlFractureOnly1, controlFractureOnly2, controlFractureNever),ncol=2,byrow=TRUE)
colnames(conditionalProbControl) <- c("Ap frac","Ap no frac") 
rownames(conditionalProbTreatment) <- c("A frac","A no frac")
conditionalProbControl <- as.table(conditionalProbControl)
conditionalProbControl
  Ap frac Ap no frac
A     0.3        0.0
B     0.1        0.6

3B: Now investigating the big result: switch %: what’s the % of pairs flipping their decisions?

controlFractureAbsSum <- sum(controlFracture$absFracture)
print(controlFractureAbsSum)
[1] 1

Visualizing change:

---
title: "Asplode test outline"
output: html_notebook
---

## Set-up packages // libraries and prepare for data import: 
```{r}
## Uncomment install.packages and run outside of notebook environment: 
## install.packages(c("psych" ,"xtable", "tidyverse", "jsonlite", "likert", "ggplot2", "ploty", "mosaic", "modelr", "broom"))

library(psych)
library(likert)
library(jsonlite)
library(ggplot2)
library(plotly)
library(modelr)
library(broom)
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())
```

## Set-up directories, import and clean data: 
```{r}
rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R") 
getwd()
dataPath = "../.data"
## Define function to extract survey results: 
extractSurvey = function(frame,survey) {
  rounds = seq(1,length(frame$results.format[[1]]))
  roundResponses = lapply(rounds, function(round) {
    getCol = paste("results.",survey,".",round, sep="")
    surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
    newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
    surveyFrame = frame[,surveyCols]
    if (is.null(newCols)) {return("No newCols")}
    names(surveyFrame) = newCols
    surveyFrame$id = frame$id
    surveyFrame$round = round
    surveyFrame$batch = frame$batch
    surveyFrame$rooms = frame$rooms
    surveyFrame$manipulation = frame$results.manipulationCheck
    surveyFrame$blacklist = frame$results.blacklistCheck
    return(surveyFrame)
  })
  return(Reduce(rbind, roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below): 
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) { 
  if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
    batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
    return(any(batchData$batchComplete == TRUE))
  } 
  return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
  userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector=TRUE)
  return(flatten(userFile, recursive = TRUE))
})

## Retroactively find rooms from chat data: 
overlappingFiles = Reduce(function(x,y) merge(x, y, all=TRUE), userFiles)
roundsWithRooms = apply(overlappingFiles,1,function(x) {
  roomsForIndividual = lapply(seq(1,3),function(y) {
    x$room = x$rooms[y]
    x$round = y
    return(x)
  })
  return(Reduce(rbind, roomsForIndividual))
})
library(tidyverse)
library(mosaic)

## To filter the right batch numbers, add the first batch number for our runs (i.e. batch >= "first batch #")  
## overlappingFiles <- overlappingFiles %>% filter(batch=="1536132233074")

```

## More cleaning before visualizations: 
```{r}
## For when we start inputting new batches only: 
# overlappingFiles <- overlappingFiles %>% filter(batch>="1536276904547") 
# qFifteen <- as.data.frame(extractSurvey(overlappingFiles, 'qFifteenCheck'))
# qSixteen <- as.data.frame(extractSurvey(overlappingFiles, 'qSixteenCheck'))
# viabilitySurvey <- as.data.frame(extractSurvey(overlappingFiles, 'viabilityCheck'))  
# qFifteen <- qFifteen %>% select(id, results.qFifteenCheck)
# qSixteen <- qSixteen %>% select(id, results.qSixteenCheck) 
# qFifteen$id <-  unlist(qFifteen$id)
# qSixteen$id <-  unlist(qSixteen$id)
# postSurveyQs <- cbind(qFifteen, qSixteen) 
# frame <- cbind(viabilitySurvey, postSurveyQs)  
# frame <- frame[, !duplicated(colnames(frame))]

frame <- extractSurvey(overlappingFiles, 'viabilityCheck')
## Reduce to vertically combine rows in roundwithRooms list:  
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe, use manipulation check b/c required for complete observation: 
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge: 
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe: 
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds: 
data2 = data2 %>% select(id, batch, room, bonus, name, friends, 
                         friends_history, results.condition, results.format,
                  results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (**this should be simplified**)
data2$batch <- unlist(data2$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id) 
data$batch <- unlist(data$batch)

## Before merge, data and data2 should have the same # of observations
## Merge columns by id, round and batch #s: 
data <- left_join(data, data2, by=NULL)
## Subset only observations with batch #s in complete batches 
allConditions <- data[data$batch %in% completeBatches, ]
```
## Conditionally assign conditions based on treatment and results column: 
```{r}
## Messy, but robust? Verify, verify, verify people:   
data <- data %>% mutate(
  condition = case_when(
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='baseline' & results.format=="1:3" & round==1 ~ "A" ,
    results.condition=='baseline' & results.format=="1:3" & round==2 ~ "B" ,
    results.condition=='baseline' & results.format=="1:3" & round==3 ~ "C" 
  )) 

```

## Set-up for factors for viability questions: 
```{r}
data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off): 
data <- na.omit(data)
## Factor for visualizations: 
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree") 
clean <- data %>% 
  mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels))) 
## Create a new dataframe that converts factors to numeric for statistical analyses:
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
  stats$sum[i] <- sum(stats[i,1:14])                          
} 
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping: 
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
  group_by(room, batch, round, condition, results.condition, results.format) %>%  ## to-do: add group ID in storage db. 
  summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n, groupID=runif(1,0,2)) %>% 
## Filter out all teams with n=1 (i.e. a person in a one person team) 
  filter(n==2)

## Individual proportion: 
individualProportion <- stats %>% group_by(round, batch, room) %>% 
  mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>% 
  filter(n>1)

## Table showing how many teams we've run in each condition combination:  
print(table(groupedProportion$condition, groupedProportion$results.condition)) 
```

## Probability of fracture across teams and condition combinations: 
```{r}
## Define and initialize cut-off point for fracture: 
groupedProportionFracture <- groupedProportion %>%
  group_by(results.condition, condition) %>% 
  mutate(fracture = case_when (prop<.50 ~ "0", 
                                prop>=.50~ "1")) 

## Baseline: (we probably won't need this since we are artificially making baseline conditions)
# groupedProportionFractureBaseline <- 
#   groupedProportionFracture %>% 
# filter(results.condition=="baseline") %>% 
#   filter(id %in% condition=="A" & condition=="C")
# baselineFracture1 <- groupedProportionFractureBaseline %>% filter(condition=="A")
# baselineFracture2 <- groupedProportionFractureBaseline %>% filter(condition=="C")

# TO-DO: check if this is right--seems off (why is baseline off?)

groupedProportionFractureBaseline <- groupedProportionFracture %>% filter(results.format=="c(1, 1, 2)" | results.format=="c(2, 1, 1)") %>% filter(round=="1" | round=="3") 
baselineFracture1 <- groupedProportionFractureBaseline %>% filter(round=="1")
baselineFracture2 <- groupedProportionFractureBaseline %>% filter(round=="3")
baselineFracture1$fracture1 <- baselineFracture1$fracture
baselineFracture2$fracture2 <- baselineFracture2$fracture
## Below only combines if teams are in both round 1 & 3 together: 

## Treatment: 
groupedProportionFractureTreatment <- groupedProportionFracture %>% 
filter(results.condition=="treatment")
treatmentFracture1 <- groupedProportionFractureTreatment %>% filter(condition=="A")
treatmentFracture2 <- groupedProportionFractureTreatment %>% filter(condition=="Ap")
treatmentFracture <- cbind(treatmentFracture1, treatmentFracture2)

## Control:  
groupedProportionFractureControl <- groupedProportionFracture %>% 
  filter(results.condition=="control") 
controlFracture1 <- groupedProportionFractureControl %>% filter(condition=="A")
controlFracture2 <- groupedProportionFractureControl %>% filter(condition=="Ap")
controlFracture <- cbind(controlFracture1, controlFracture2)

## Manipulation check: 
## Use individual proportion data frame b/c we're interested in looking at individuals that were in teams >n=1
teamPatterns <- "Team 1 and Team 2|Team 1 and Team 3|Team 2 and Team 3"
individualProportion$manipulationAnswerKey <- str_extract(individualProportion$results.manipulationCheck,teamPatterns) 
individualProportion$manipulationAnswers <- individualProportion$results.manipulation
individualProportion$results.condition = unlist(individualProportion$results.condition)
## Omit NAs and results that can't be processed: 
cleanManipulation <- individualProportion[with(individualProportion, grepl(teamPatterns, manipulationAnswers) & grepl(teamPatterns, manipulationAnswerKey)),]
## Transform into compatiable form: 
cleanManipulation$manipulationAnswers = unlist(cleanManipulation$manipulationAnswers)
```

## Section #1: Does fracture have continuity with prior measures?

## 1A: Report the % of correct guesses on the manipulation check: 
```{r}
## Treatment manipulation: 
treatmentManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="treatment") %>% 
  summarise(n=n()) 
## If user manipulation answers == manipulation answer key then add 1 
treatmentManipulation$correctAnswers <- sum(ifelse(treatmentManipulation$manipulationAnswers==treatmentManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for treatment: 
treatmentManipulation <- treatmentManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(treatmentManipulation))))

## Control manipulation: 
controlManipulation <- cleanManipulation %>% group_by(id, results.condition, manipulationAnswers, manipulationAnswerKey) %>% filter(results.condition=="control") %>% 
  summarise(n=n()) 
## If user manipulation answers == manipulation answer key then add 1 
controlManipulation$correctAnswers <- sum(ifelse(controlManipulation$manipulationAnswers==controlManipulation$manipulationAnswerKey,1,0))
## Calculate percent of correct manipulation answers for control: 
controlManipulation <- controlManipulation %>% mutate(percentCorrect = (correctAnswers/(nrow(controlManipulation))))
## Proportion test comparing treatment manipulation percent correct with chance: 
```

## 1B: Proportion test comparing treatment manipulation percent correct with chance: 
```{r}
## ⅓ ~ 33% = chance, ~43 = treatment (FILL-IN EACH TIME YOU RUN WITH NEW DATA!)

propManipulation <- prop.test(x = c(33, 43), n = c(100, 100))
print(propManipulation)
```

## 1C: Logistic regression predicting binary fracture outcome from viability scales: 
```{r}
## Split data into 60% training and 40% testing data sets to test how well the model performs 
set.seed(123)
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture) 
sample <- sample(c(TRUE, FALSE), nrow(groupedProportionFracture), replace = T, prob = c(0.6,0.4))
train <- groupedProportionFracture[sample, ]
test <- groupedProportionFracture[!sample, ]

## Simple logistic regression: we will fit a logistic regression model in order to predict 
## the probability of fracture based on a team's average viability sum: 

model1 <- glm(fracture ~ mean, family = "binomial", data = train)
summary(model1)

## To assess the linear regression deviance, look at deviance in summary output, if 
## deviance == sum of sqaures in linear regression, null deviance == difference between 
## a model with only the intercept ("no mean predictors") and the a saturated model 
## a model with a theoretically perfect fit. Model deviance (residual deviance) should be lower 
## small values == better fit. 

tidy(model1)
## Coefficient estimtes from log regression characterize relationship between the predictor and 
## response variable on a log-odds scale, so binary increase from no fracture - fracture can be interpreted as associated with a decrease in mean viability sum. 

## More coefficient output: measure the confidence intervals and accuracy of the coefficent: 
confint(model1)

## Making predictions: 
## What is the probability of fracture given the following team mean viability scores: for example purposes: 40 and 65: 

predict(model1, data.frame(mean = c(50, 65)), type = "response")

## From the output, we can see that the probability of fracture decreases by ~40% when mean viability sum increases from 40 to 65. 

## Model evaluation & diagnostics: 
## How well does the model fit the data? And how accurate are the predictions on an out-of-sample data set?

## Residul assessment: 

model1_data <- augment(model1) %>% 
  mutate(index = 1:n())

ggplot(model1_data, aes(index, .std.resid, color = mean)) + 
  geom_point(alpha = .5) +
  geom_ref_line(h = 3)

## Validation of predicted values: 
## How well does the model perform when predicting the target variable on out-of-sample observations? 

test.predicted.m1 <- predict(model1, newdata = test, type = "response")

## Classification performance for each model on the test data. Output gives us a list of true / false positives: 

list(
  model1 = table(test$mean, test.predicted.m1 > 0.5) %>% prop.table() %>% round(3)) 

table(test$mean, test.predicted.m1 > 0.5)
```
## 1D: Graph of fracture/no fracture vs. mean/stdev viability scale

```{r}
## Overall: 
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean)) 
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 across all conditions (n=2)", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") 

```

```{r}
## By condition + format: 
g <- ggplot(groupedProportionFracture, aes(factor(fracture), mean)) 
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team fracture value vs. mean viability score: fracture >=0.50 (n=2)", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition) 
```

```{r}
ggplot(data=groupedProportionFracture, aes(fracture, mean)) + 
   geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums across all conditions (n=2)") +  facet_grid(condition ~ results.condition) 
```

```{r}
ggplot(data=groupedProportionFracture, aes(fracture, mean)) + 
   geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of fracture value vs. mean for team viability sums (n=2)") +  facet_grid(condition ~ results.condition) 
```

## Section #2: How often does fracture occur?

## 2A: P(binary fracture) histogram of fracture proportion (by team)
```{r}
## First make sure you've made ABC condition
ggplot(data=groupedProportion, aes(groupedProportion$prop)) + 
  geom_histogram(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Frequency of team fracture proportions by condition and pattern sequence", 
                                  x="team fracture proportion", y="Count") + facet_grid(results.condition ~ .) 
```
```{r}
ggplot(data=groupedProportion, aes(groupedProportion$prop)) + 
  geom_histogram(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Frequency of team fracture proportions by condition and pattern sequence", 
                                  x="team fracture proportion", y="Count") + facet_grid(results.condition ~ condition)  
```

```{r}
library(MASS)     

## Test if whether fracture smoking habit is independent of condition at .05 significance level: 

chi = table(groupedProportionFracture$fracture, groupedProportionFracture$results.condition)  
chi
print(chisq.test(chi)) 
```

## 2B: What's the overall % of fracturing the second time? (by condition)
```{r}
## Filtering second time only: 
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture)

## TO-DO: review below code, worried it's incorrect: 

groupedProportionFractureSecond <- groupedProportionFracture %>% group_by(results.condition) %>% 
      filter(condition=="Ap") %>% 
      mutate(overallPercent = sum(as.numeric(fracture))) %>% 
      summarise(n=n(), overallPercent=unique(overallPercent)/n) 
print(groupedProportionFractureSecond)

## For baseline theoretical distribution: (to-do: look over this, worried also that it's wrong)

groupedProportionFractureSecondBase <- groupedProportionFractureBaseline %>% 
      filter(round=="3") %>% 
      summarise(n=n(), overallPercent=sum(as.numeric(fracture))/n) 
print(mean(groupedProportionFractureSecondBase$overallPercent))

## Set-up separate groups for proportion tests to answer: does unmasked fracture more/less than masked, and more/less than new pairs? 

## Control: 
## Proportion tests for each (TO-DO, figure out argument stuff: this doesn't seem like the right test to run for the question we're asking?) also, fill-this in every time you get new data: 

## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. treatment + baseline) 
treatmentvsBaselineChange <- c(48, 49)
propFractureChangeTreatmentvsBase <- prop.test(x = c(treatmentvsBaselineChange), n = c(100, 100))
print(propFractureChangeTreatmentvsBase) 

## Is the proportion of fracture in the second round significantly different in the two conditions (i.e. control + baseline) 
controlvsBaselineChange <- c(40, 49)
propFractureChangeControlvsBase <- prop.test(x = c(controlvsBaselineChange), n = c(100, 100))
print(propFractureChangeControlvsBase)
```
## Section 3: How consistent is fracture? 

## 3A: calculating the conditional probabilities of fracture for each condition: 
```{r}

## If fractured the first time, what's the % of fracturing the second time? look at 1 and Ap combination for % in second time: 

## Compare both treatment and control with baseline as theoretical distribution: 

## Conditional probability for masked: 
# of groups who fractured 1 and 2
treatmentFracture12 <- sum(ifelse(treatmentFracture1$fracture=="1" & treatmentFracture2$fracture=="1",1,0))/nrow(treatmentFracture)
# of groups who fractured only 1
treatmentFractureOnly1 <- sum(ifelse(treatmentFracture1$fracture=="1" & treatmentFracture2$fracture=="0",1,0))/nrow(treatmentFracture) 
# of groups who fractured only 2
treatmentFractureOnly2 <- sum(ifelse(treatmentFracture1$fracture=="0" & treatmentFracture2$fracture=="1",1,0))/nrow(treatmentFracture)
# of groups who fractured never        
treatmentFractureNever <- sum(ifelse(treatmentFracture1$fracture=="0" & treatmentFracture2$fracture=="0",1,0))/nrow(treatmentFracture)
conditionalProbTreatment <- matrix(c(treatmentFracture12, treatmentFractureOnly1, treatmentFractureOnly2, treatmentFractureNever),ncol=2,byrow=TRUE)
colnames(conditionalProbTreatment) <- c("Ap frac","Ap no frac") 
rownames(conditionalProbTreatment) <- c("A frac","A no frac")
conditionalProbTreatment <- as.table(conditionalProbTreatment)
conditionalProbTreatment

## Conditional proability for unmasked: 
# of groups who fractured 1 and 2
controlFracture12 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured only 1
controlFractureOnly1 <- sum(ifelse(controlFracture1$fracture=="1" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture) 
# of groups who fractured only 2
controlFractureOnly2 <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="1",1,0))/nrow(controlFracture)
# of groups who fractured never        
controlFractureNever <- sum(ifelse(controlFracture1$fracture=="0" & controlFracture2$fracture=="0",1,0))/nrow(controlFracture)
conditionalProbControl <- matrix(c(controlFracture12, controlFractureOnly1, controlFractureOnly2, controlFractureNever),ncol=2,byrow=TRUE)
colnames(conditionalProbControl) <- c("Ap frac","Ap no frac") 
rownames(conditionalProbTreatment) <- c("A frac","A no frac")
conditionalProbControl <- as.table(conditionalProbControl)
conditionalProbControl
```

## 3B: Now investigating the big result: switch %: what's the % of pairs flipping their decisions?

```{r}
## Convert format before merge: *this can be simplified*: 

treatmentFracture2$fracture <- as.numeric(treatmentFracture2$fracture) 
treatmentFracture1$fracture <- as.numeric(treatmentFracture1$fracture) 
treatmentFracture$absFracture <- abs(treatmentFracture2$fracture-treatmentFracture1$fracture)

controlFracture1$fracture <- as.numeric(controlFracture1$fracture)
controlFracture2$fracture <- as.numeric(controlFracture2$fracture)
controlFracture$absFracture <- abs(controlFracture2$fracture-controlFracture1$fracture)

# baselineFracture$fracture1 <- as.numeric(baselineFracture$fracture1) 
# baselineFracture$fracture2 <- as.numeric(baselineFracture$fracture2) 
# baselineFracture$absFracture <- abs(baselineFracture$fracture1-baselineFracture$fracture2)

## Results here seem fishy? Triple check my code for abs/sum? 

treatmentFractureAbsSum <- sum(treatmentFracture$absFracture) 
print(treatmentFractureAbsSum)
controlFractureAbsSum <- sum(controlFracture$absFracture)
print(controlFractureAbsSum)
## baselineFractureAbsSum <- sum(baselineFracture$absFracture) 

## Don't we want a proportion test here before adding / trying chi-squared? 

treatmentAbsChange=c() 
controlAbsChange=c()

## Using baseline as theoretical probability distribution 
## Treatment: 
chisq.test(treatmentAbsChange,p=baselineAbsChange)

## Control:
chisq.test(controlAbsChange,p=baselineAbsChange)
```

## Visualizing change: 

```{r}
## Visualize differences: 
normalizedTreatmentChange <- treatmentFractureAbsSum/nrow(treatmentFracture)
normalizedControlChange <- controlFractureAbsSum/nrow(controlFracture)

dat <- data.frame(changesPerCondition= factor(c("Normalized percentage of absolute fracture change in masked >=.50","Normalized percentage of absolute fracture change in unmasked >=.50"), levels=c("Normalized percentage of absolute fracture change in masked >=.50","Normalized percentage of absolute fracture change in unmasked >=.50")),
                  fractureValueSwitch = c(normalizedTreatmentChange, normalizedControlChange))

p <- ggplot(data=dat, aes(x=changesPerCondition, y=fractureValueSwitch)) +
  geom_bar(stat="identity")
p <- ggplotly(p)
p
```








